# Purpose: Produce regional estimates along a number of dimensions

source("constants.R")

##############################
# 0. Define helper functions #
##############################

### Function 1: A helper to read in the raw averages from Stata ###
# avgs_file = the file name that has the averages and the Ns
# sds_file = the file name that has the standard deviations
# sample = the sample for which these coefficients are defined
# coeff_prefix = generally either sy_ (for Syrian) or native_ (for natives)
clean_stata_averages <- function(avgs_file, sds_file, sample, coeff_prefix){

  raw_avgs <- read_csv(str_interp("output/${avgs_file}")) %>%
    mutate(nuts3_encoded = if_else(nuts3_encoded == "_dummy", baseline_region, nuts3_encoded)) %>%
    gather(measure, avg, -nuts3_encoded, -n) %>%
    select(nuts3=nuts3_encoded, measure, n, avg) %>%
    mutate(sample=sample)

  raw_sds <- read_csv(str_interp("output/${sds_file}")) %>%
    mutate(nuts3_encoded = if_else(nuts3_encoded == "_dummy", baseline_region, nuts3_encoded)) %>%
    gather(measure, sd, -nuts3_encoded) %>%
    select(nuts3=nuts3_encoded, measure, sd) %>%
    mutate(sample=sample)

  dummy_avgs <- read_csv(str_interp("output/${avgs_file}")) %>%
    filter(nuts3_encoded == "_dummy") %>%
    gather(measure, dummyAvg, -nuts3_encoded, -n) %>%
    select(measure, dummyAvg) %>%
    mutate(sample=sample)

  raw_avgs %>%
    inner_join(raw_sds, by=c("nuts3", "sample", "measure")) %>%
    inner_join(dummy_avgs, by=c("sample", "measure")) %>%
    rename_at(c("avg", "n", "sd", "dummyAvg"), function(x) paste0(coeff_prefix,x))
}


### Function 1A: A simple version to do the above for a split sample ###
# avgs_file = the file name that has the split sample averages
# sample = the sample for which these coefficients are defined
# coeff_prefix = generally either sy_ (for Syrian) or native_ (for natives)
clean_stata_averages_split_sample <- function(avgs_file, sample, coeff_prefix){

  raw_avgs <- read_csv(str_interp("output/${avgs_file}")) %>%
    mutate(nuts3_encoded = if_else(nuts3_encoded == "_dummy", baseline_region, nuts3_encoded)) %>%
    gather(measure, avg, -nuts3_encoded, -svar, -n)

  raw_avgs.split1 <- raw_avgs %>%
    filter(svar == 1) %>%
    select(nuts3=nuts3_encoded, measure, avg_split1=avg)

  raw_avgs.split2 <- raw_avgs %>%
    filter(svar == 2) %>%
    select(nuts3=nuts3_encoded, measure, avg_split2=avg)

  raw_avgs.split1 %>%
    left_join(raw_avgs.split2, by=c("nuts3","measure")) %>%
    mutate(sample=sample) %>%
    rename_at(c("avg_split1", "avg_split2"), function(x) paste0(coeff_prefix,x))
}


### Function 2: A helper to read in the Stata FEs ###
# file = the file name of coefficients to read
#         If using 'quick version' (which are FEs produced by demeaning rather than true dummies)
#         then 'file' should be a vector of files.
# path = the path you can find these files in
# sample = the sample for which these coefficients are defined
# coeff_prefix = generally either sy_ (for Syrian) or native_ (for natives)
# split_sample = T if this is a split sample dataset
clean_stata_nuts3_fes <- function(file, path, sample, coeff_prefix="", split_sample=F){

  # Read in the NUTS3 FEs from the Stata regressions
  fe_dat_in <- suppressWarnings(read_csv(str_interp("${path}/${file}"), skip=2))
  names(fe_dat_in) <- c("nuts3", str_remove_all(names(fe_dat_in), "\\*|\\=|\"")[c(2:ncol(fe_dat_in))])

  # These are the rows we want to pick out
  # 1 + these rows are the SEs
  nuts3_fe_row_nums <- fe_dat_in %>%
    mutate_all(~str_remove_all(.x, "\\*|\\=|\"|\\(|\\)")) %>%
    mutate(row_num = 1:n()) %>%
    filter(grepl("DE", nuts3) | grepl("_dummy", nuts3)) %>%
    select(nuts3, row_num)

  nuts3_fes_coeffs <- fe_dat_in %>%
    mutate_all(~str_remove_all(.x, "\\*|\\=|\"|\\(|\\)")) %>%
    mutate(row_num = 1:n()) %>%
    filter(row_num %in% nuts3_fe_row_nums$row_num) %>%
    select(-row_num) %>%
    gather(measure, coeff, -nuts3) %>%
    mutate(coeff = as.numeric(coeff))

  # This above will throw a warning because this SE is NA for our dummy.
  # To generate an SE for our dummy we use the minimum with each measure.
  # This is sensible because we use the MAX sized region as the baseline.
  nuts3_fes_SEs <- fe_dat_in %>%
    mutate_all(~str_remove_all(.x, "\\*|\\=|\"|\\(|\\)")) %>%
    mutate(row_num = 1:n()) %>%
    filter(row_num %in%  (nuts3_fe_row_nums$row_num + 1)) %>%
    select(-row_num) %>%
    mutate(nuts3 = nuts3_fe_row_nums$nuts3) %>%
    gather(measure, coeffSE, -nuts3) %>%
    mutate(coeffSE = as.numeric(coeffSE)) %>%
    suppressWarnings()

  nuts3_fes_SEs <- nuts3_fes_SEs %>%
    group_by(measure) %>%
    summarise(min_coeffSE = min(coeffSE, na.rm=T)) %>%
    ungroup %>%
    right_join(nuts3_fes_SEs, by="measure") %>%
    mutate(coeffSE = if_else(is.na(coeffSE), min_coeffSE, coeffSE)) %>%
    select(-min_coeffSE)

  final_dat <- nuts3_fes_coeffs %>%
    left_join(nuts3_fes_SEs, by=c("nuts3", "measure")) %>%
    mutate(sample = sample) %>%
    mutate(nuts3 = if_else(nuts3 == "_dummy", baseline_region, nuts3)) %>%
    # This ensures the ordering is correct before the renaming
    select(nuts3, measure, coeff, coeffSE, sample) %>%
    setNames(c("nuts3", "measure",
               paste0(coeff_prefix, "coeff"),
               paste0(coeff_prefix, "coeffSE"),
               "sample"))

  # If it is a split sample dataset we need to do a little more work to make the data wide
  if(split_sample){

    final_dat <- inner_join(

      final_dat %>%
        # Check whether column even or odd -- get the even split
        filter(as.numeric(str_extract(measure, "(\\d+$)")) %% 2 == 0) %>%
        mutate(measure = str_remove(measure, "...(\\d+$)")) %>%
        rename_at(c(paste0(coeff_prefix, "coeff"), paste0(coeff_prefix, "coeffSE")),
                  function(x) paste0(x,"_split1")),

      final_dat %>%
        # Check whether column even or odd -- get the odd split
        filter(as.numeric(str_extract(measure, "(\\d+$)")) %% 2 != 0) %>%
        mutate(measure = str_remove(measure, "...(\\d+$)")) %>%
        rename_at(c(paste0(coeff_prefix, "coeff"), paste0(coeff_prefix, "coeffSE")),
                  function(x) paste0(x,"_split2")),

      by = c("nuts3", "measure", "sample")

    )
  }

  final_dat
}

#####################################
# 1. Read in and clean all the data #
#####################################

##### A. The Syrian Migrants ######

## Raw measures
sy_raw <- bind_rows(

  # Overall
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns.csv",
                       sds_file="sy_migrant_nuts3_sds.csv", sample="all", coeff_prefix="sy_"),

  # Overall by time
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_quarterly_all.csv",
                       sds_file="sy_migrant_nuts3_sds_quarterly_all.csv", sample="all", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_quarterly_15_16.csv",
                       sds_file="sy_migrant_nuts3_sds_quarterly_15_16.csv", sample="all", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_quarterly_17_18.csv",
                       sds_file="sy_migrant_nuts3_sds_quarterly_17_18.csv", sample="all", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_quarterly_19_20.csv",
                       sds_file="sy_migrant_nuts3_sds_quarterly_19_20.csv", sample="all", coeff_prefix="sy_"),

  # By gender
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_isfemale_0.csv",
                       sds_file="sy_migrant_nuts3_sds_isfemale_0.csv", sample="isfemale_0", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_isfemale_1.csv",
                       sds_file="sy_migrant_nuts3_sds_isfemale_1.csv", sample="isfemale_1", coeff_prefix="sy_"),

  # By age
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_age_18_24.csv",
                       sds_file="sy_migrant_nuts3_sds_age_18_24.csv", sample="age_18_24", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_age_25_34.csv",
                       sds_file="sy_migrant_nuts3_sds_age_25_34.csv", sample="age_25_34", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_age_35_44.csv",
                       sds_file="sy_migrant_nuts3_sds_age_35_44.csv", sample="age_35_44", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_age_45_54.csv",
                       sds_file="sy_migrant_nuts3_sds_age_45_54.csv", sample="age_45_54", coeff_prefix="sy_"),
  clean_stata_averages(avgs_file="sy_migrant_nuts3_avgs_and_Ns_age_55.csv",
                       sds_file="sy_migrant_nuts3_sds_age_55.csv", sample="age_55", coeff_prefix="sy_")

)

## Raw measures using the split sample
sy_raw_split_sample <- clean_stata_averages_split_sample("sy_migrant_nuts3_avgs_and_Ns_split_sample.csv", "all", "sy_")


## Coefficients using the full sample
sy_coeff_full_sample <- bind_rows(
  # All users - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary.csv", "output",
                        sample="all", coeff_prefix="sy_"),
  # All users - other outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_other_fr.csv", "output",
                        sample="all", coeff_prefix="sy_"),

  # All users - temporal outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_quarterly.csv", "output",
                        sample="all", coeff_prefix="sy_"),

  # Is female == 0 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_isfemale_0.csv", "output",
                        sample="isfemale_0", coeff_prefix="sy_"),
  # Is female == 1 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_isfemale_1.csv", "output",
                        sample="isfemale_1", coeff_prefix="sy_"),

  # Age 18-24 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_age_18_24.csv", "output",
                        sample="age_18_24", coeff_prefix="sy_"),
  # Age 25-34 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_age_25_34.csv", "output",
                        sample="age_25_34", coeff_prefix="sy_"),
  # Age 35-44 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_age_35_44.csv", "output",
                        sample="age_35_44", coeff_prefix="sy_"),
  # Age 45-54 - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_age_45_54.csv", "output",
                        sample="age_45_54", coeff_prefix="sy_"),
  # Age 55+ - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_age_55.csv", "output",
                        sample="age_55", coeff_prefix="sy_"),
)

## Coefficients using the split sample
sy_coeff_split <- bind_rows(
  # All users - main outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_primary_split_sample.csv", "output",
                        sample="all", coeff_prefix="sy_", split_sample=T),
  # All users - other outcomes
  clean_stata_nuts3_fes("nuts3_FEs_sy_migrant_other_fr_split_sample.csv", "output",
                        sample="all", coeff_prefix="sy_", split_sample=T)
)


##### B. The Natives #####

## Raw averages
native_raw <- bind_rows(
  clean_stata_averages(avgs_file="natives_nuts3_avgs_and_Ns.csv",
                       sds_file="natives_nuts3_sds.csv", sample="all", coeff_prefix="native_"),
  clean_stata_averages(avgs_file="natives_proasyl_nuts3_avgs_and_Ns.csv",
                       sds_file="natives_proasyl_nuts3_sds.csv", sample="all", coeff_prefix="native_")
)


###### C. The other migrants #######

## Raw averages
recent_t5_raw <- clean_stata_averages(avgs_file="recent_t5_migrant_nuts3_avgs_and_Ns.csv",
                                      sds_file="recent_t5_migrant_nuts3_sds.csv", sample="all", coeff_prefix="t5_")

###### COMBINE EVERYTHING #######

dat_in_full <- sy_raw %>%
  full_join(sy_raw_split_sample, by=c("nuts3", "sample", "measure")) %>%
  full_join(native_raw, by=c("nuts3", "sample", "measure")) %>%
  full_join(recent_t5_raw, by=c("nuts3", "sample", "measure")) %>%
  # This cleans up the order
  select(nuts3, sample, sy_n, native_n, t5_n,
         measure,
         sy_avg, sy_sd, sy_dummyAvg, sy_avg_split1, sy_avg_split2,
         native_avg, native_sd, native_dummyAvg,
         t5_avg, t5_sd, t5_dummyAvg) %>%
  left_join(sy_coeff_full_sample, by=c("nuts3", "sample", "measure")) %>%
  left_join(sy_coeff_split, by=c("nuts3", "sample", "measure"))



########################################################
# 2. Make the averages residualized for Facebook usage #
########################################################

###### A. Build data for residualization on intensive margin #######
# See the README for instructions on how to access this data
residualize_intensive <- read_csv("input/regional_integration_resid_data.csv")

###### B. Build data for residualization on extensive margin #######
# See the README for instructions on how to access this data
native_pop_fb <- read_csv("input/regional_integration_native_pop.csv")

# See the README for instructions on how to access these data
kreis_xw <- read_csv("input/nuts3_DE_xw.csv")
kreis_demos <- read_csv("input/kreis_total_pop_by_age_gender.csv")

kreis_to_exclude <- unique(filter(kreis_demos, year == 2019, is.na(frac_syr_tot))$ags)

residualize_extensive <- kreis_demos %>%
  filter(year == 2019) %>%
  left_join(kreis_xw, by=c("ags"="ID")) %>%
  mutate(pop_native = pop_tot * (1-frac_for_tot)) %>%
  select(nuts3=NUTS3, ags, pop_tot, pop_native) %>%
  left_join(native_pop_fb, by=c("nuts3"="curr_nuts3")) %>%
  mutate(share_natives_on_fb = n_native_fb/pop_native)

# Impute the bad data with overall mean
impute_val <- mean(filter(residualize_extensive, !ags %in% kreis_to_exclude)$share_natives_on_fb)
residualize_extensive <- residualize_extensive %>%
  mutate(share_natives_on_fb = if_else(ags %in% kreis_to_exclude, impute_val, share_natives_on_fb)) %>%
  select(nuts3, share_natives_on_fb)


###### C. Now do the residualization in the loop #######

dat.w_controls <- dat_in_full %>%
  left_join(residualize_intensive, by="nuts3") %>%
  left_join(residualize_extensive, by="nuts3")

samples <- unique(dat.w_controls$sample)
measures <- unique(dat.w_controls$measure)
lhs_to_use <- c("sy_avg", "sy_avg_split1", "sy_avg_split2", "native_avg", "t5_avg")

dat_in_full.w_resid <- NULL
# Loop through each sample and measure combination
for(curr_sample in samples){
  for(curr_measure in measures){

    curr_dat <- filter(dat.w_controls, sample == curr_sample & measure == curr_measure)

    # Sometimes there won't be this sample/measure
    if(nrow(curr_dat) == 0){next}

    # Loop through all LHS
    for(curr_lhs in lhs_to_use){

      # Sometimes this LHS for this sample and measure will be all NA
      if(sum(!is.na(curr_dat[[curr_lhs]])) == 0){
        curr_dat[[paste0(curr_lhs, "_resid")]] <- NA
      }
      # Otherwise generate the residualized measure
      else{

        # Add an additional control if we are residualizing on groups
        if(curr_measure == "n_lcl_50n_grps"){
          curr_controls <- "indiv_l1080_cntrl + share_natives_on_fb + avg_total_grps"
        }
        # Otherwise just use the baseline
        else{
          curr_controls <- "indiv_l1080_cntrl + share_natives_on_fb"
        }

        curr_dat[[paste0(curr_lhs, "_resid")]] <-
          mean(curr_dat[[curr_lhs]]) +
          resid(lm(formula=paste(curr_lhs, curr_controls, sep=" ~ "), data=curr_dat))
      }

    }

    # Finally add this dataset with the new residualized outcome to the final dataset
    if(is.null(dat_in_full.w_resid)){dat_in_full.w_resid <- curr_dat}
    else{dat_in_full.w_resid <- bind_rows(dat_in_full.w_resid, curr_dat)}

  }
}



####################################
# 3. Calc the reliability measures #
####################################

# Get overall reliabilities
overall_reliabilities <- dat_in_full.w_resid %>%
  # The standard errors for the Syrian measures
  mutate(sy_coeffSE_sqr = sy_coeffSE^2) %>%
  mutate(sy_avgSE_sqr = (sy_sd/(sqrt(sy_n)))^2) %>%
  # The standard errors for the Native measures
  mutate(native_avgSE_sqr = (native_sd/(sqrt(native_n)))^2) %>%
  # The standard errors for the other T5 Refugee measures
  mutate(t5_avgSE_sqr = (t5_sd/(sqrt(t5_n)))^2) %>%
  group_by(sample, measure) %>%
  summarise(

    # Syrian split summary measure
    sy_SD_of_coeff_split1 = sqrt(wtd.var(sy_coeff_split1, w=sy_n)),
    sy_SD_of_coeff_split2 = sqrt(wtd.var(sy_coeff_split2, w=sy_n)),
    sy_SD_of_avg_split1 = sqrt(wtd.var(sy_avg_split1, w=sy_n)),
    sy_SD_of_avg_split2 = sqrt(wtd.var(sy_avg_split2, w=sy_n)),
    sy_SD_of_avg_split1_resid = sqrt(wtd.var(sy_avg_split1_resid, w=sy_n)),
    sy_SD_of_avg_split2_resid = sqrt(wtd.var(sy_avg_split2_resid, w=sy_n)),
    sy_corr_of_split_coeffs = wtd.cor(sy_coeff_split1, sy_coeff_split2, w=sy_n)[1],
    sy_corr_of_split_avgs = wtd.cor(sy_avg_split1, sy_avg_split2, w=sy_n)[1],
    sy_corr_of_split_avgs_resid = wtd.cor(sy_avg_split1_resid, sy_avg_split2_resid, w=sy_n)[1],

    # Syrian other summary measures
    sy_SD_of_coeffs = sqrt(wtd.var(sy_coeff, w=sy_n)),
    sy_SD_of_avgs = sqrt(wtd.var(sy_avg, w=sy_n)),
    sy_SD_of_avgs_resid = sqrt(wtd.var(sy_avg_resid, w=sy_n)),
    sy_mean_of_coeff = wtd.mean(sy_coeff, w=sy_n),
    sy_mean_of_avg = wtd.mean(sy_avg, w=sy_n),
    sy_mean_of_coeffSE_sqr = wtd.mean(sy_coeffSE_sqr, w=sy_n),
    sy_mean_of_avgSE_sqr = wtd.mean(sy_avgSE_sqr, w=sy_n),

    # Native summary measures
    # native_SD_of_coeffs = sqrt(wtd.var(native_coeff, w=native_n)),
    native_SD_of_avgs = sqrt(wtd.var(native_avg, w=native_n)),
    # native_mean_of_coeff = wtd.mean(native_coeff, w=native_n),
    native_mean_of_avg = wtd.mean(native_avg, w=native_n),
    # native_mean_of_coeffSE_sqr = wtd.mean(native_coeffSE_sqr, w=native_n),
    native_mean_of_avgSE_sqr = wtd.mean(native_avgSE_sqr, w=native_n),

    # Other t5 summary measures
    # t5_SD_of_coeffs = sqrt(wtd.var(t5_coeff, w=t5_n)),
    t5_SD_of_avgs = sqrt(wtd.var(t5_avg, w=t5_n)),
    # t5_mean_of_coeff = wtd.mean(t5_coeff, w=t5_n),
    t5_mean_of_avg = wtd.mean(t5_avg, w=t5_n),
    # t5_mean_of_coeffSE_sqr = wtd.mean(t5_coeffSE_sqr, w=t5_n),
    t5_mean_of_avgSE_sqr = wtd.mean(t5_avgSE_sqr, w=t5_n)

  ) %>%
  ungroup %>%
  # Calculate each of the reliabilities
  mutate(
    sy_reliability_split_coeffs = sy_corr_of_split_coeffs * ((sy_SD_of_coeff_split1 * sy_SD_of_coeff_split2) / (sy_SD_of_coeffs^2)),
    sy_reliability_split_avgs = sy_corr_of_split_avgs * ((sy_SD_of_avg_split1 * sy_SD_of_avg_split2) / (sy_SD_of_avgs^2)),
    sy_reliability_split_avgs_resid = sy_corr_of_split_avgs_resid * ((sy_SD_of_avg_split1_resid * sy_SD_of_avg_split2_resid) / (sy_SD_of_avgs_resid^2)),
    sy_reliability_se_coeffs = ((sy_SD_of_coeffs^2) - sy_mean_of_coeffSE_sqr) / (sy_SD_of_coeffs^2),
    sy_reliability_se_avgs = ((sy_SD_of_avgs^2) - sy_mean_of_avgSE_sqr) / (sy_SD_of_avgs^2),
    native_reliability_se_avgs = ((native_SD_of_avgs^2) - native_mean_of_avgSE_sqr) / (native_SD_of_avgs^2),
    t5_reliability_se_avgs = ((t5_SD_of_avgs^2) - t5_mean_of_avgSE_sqr) / (t5_SD_of_avgs^2))

# Write the relabilities to file
overall_reliabilities %>%
  filter(sample == "all") %>%
  select(sample, measure,
         sy_reliability_se_coeffs, sy_reliability_se_avgs,
         sy_reliability_split_coeffs, sy_reliability_split_avgs, sy_reliability_split_avgs_resid,
         native_reliability_se_avgs,
         t5_reliability_se_avgs) %>%
  write_csv("output/overall_reliabilities.csv")


# Make the adjusted coefficient levels
dat <- dat_in_full.w_resid %>%
  mutate(sy_coeff_adj = sy_coeff + sy_dummyAvg)

write_csv(dat, "output/regional_measures.csv")



################################
### 3. Regional correlation ###
################################

# Make a dataset with just the Syrian migrant populations
fb_sy_pop <- dat %>%
  filter(sample == "all", measure == "n_frnd_nat_lcl") %>%
  select(nuts3, sy_n) %>% distinct()

decomp_sy_migrants <- dat %>%
  filter(sample == "all") %>%
  select(nuts3, measure, sy_avg_resid) %>%
  spread(key=measure, value=sy_avg_resid) %>%
  select(nuts3, n_frnd_nat_lcl, n_lcl_50n_grps, produ_any_de, n_frnd_sy_lcl) %>%
  inner_join(fb_sy_pop, by="nuts3")

decomp_natives <- dat %>%
  filter(sample == "all") %>%
  select(nuts3, measure, native_avg_resid) %>%
  spread(key=measure, value=native_avg_resid) %>%
  select(nuts3, n_frnd_nat_lcl, n_lcl_50n_grps)

nuts3_fes_wide.decomp <- decomp_sy_migrants %>%
  inner_join(decomp_natives, by="nuts3", suffix=c("", "_NATIVE")) %>%
  mutate(
    relative_frnds = n_frnd_nat_lcl/n_frnd_nat_lcl_NATIVE,
    relative_grps = n_lcl_50n_grps/n_lcl_50n_grps_NATIVE) %>%
  select(-nuts3)

# Generate the weighted correlations
nuts3_corrs.decomp <- wtd.cors(
  select(nuts3_fes_wide.decomp, -sy_n),
  weight = nuts3_fes_wide.decomp$sy_n) %>%
  round(3) %>%
  as_tibble(rownames="measure")


### Generate the signal correlations ###

# Here we make a conservative assumption that the reliability
# of the native measures are 1. This means for the relative friending terms
# the only source of reliability is the numerator. We replace with that here.
avgs.reliabilities.decomp <- bind_rows(

  overall_reliabilities %>%
    filter(sample == "all") %>%
    select(measure, sy_reliability_se_avgs),

  # For the relative friending measures, we use only the reliability
  # from the numerator. General friending has essentially no sampling error.
  overall_reliabilities %>%
    filter(sample == "all") %>%
    select(measure, sy_reliability_se_avgs) %>%
    mutate(measure = case_when(
      measure == "n_frnd_nat_lcl" ~ "relative_frnds",
      measure == "n_lcl_50n_grps" ~ "relative_grps"
    )) %>%
    filter(!is.na(measure))
)

# Generate final correlations
nuts3_signal_corrs.decomp <- nuts3_corrs.decomp %>%
  gather(measure2, corr, -measure) %>%
  left_join(avgs.reliabilities.decomp, by="measure") %>%
  rename(reliability = sy_reliability_se_avgs) %>%
  left_join(avgs.reliabilities.decomp, by=c("measure2"="measure")) %>%
  rename(reliability2 = sy_reliability_se_avgs) %>%
  mutate_if(is.numeric, coalesce, 1) %>%
  mutate(signal_corr = (1/sqrt(reliability)) * (1/sqrt(reliability2)) * corr) %>%
  select(measure, measure2, signal_corr) %>%
  spread(measure2, signal_corr)

# Clean up for the final output
nums <- paste0("(", 1:(ncol(nuts3_corrs.decomp)-1), ")")

# Produce the final correlations table
nuts3_corrs.decomp <- nuts3_corrs.decomp %>%
  select(measure, n_frnd_nat_lcl, produ_any_de, n_lcl_50n_grps, n_frnd_sy_lcl,
         n_frnd_nat_lcl_NATIVE, n_lcl_50n_grps_NATIVE, relative_frnds, relative_grps) %>%
  mutate(measure = case_when(
    measure == "n_frnd_nat_lcl" ~ "(1) SY Migrants - N Local Native Friends",
    measure == "produ_any_de" ~ "(2) SY Migrants - Produced Content in DE",
    measure == "n_lcl_50n_grps" ~ "(3) SY Migrants - N Local Native Groups",
    measure == "n_frnd_sy_lcl" ~ "(4) SY Migrants - N Local SY Friends",
    measure == "n_frnd_nat_lcl_NATIVE" ~ "(5) General Friendliness - Friending",
    measure == "n_lcl_50n_grps_NATIVE" ~ "(6) General Friendliness - Groups",
    measure == "relative_frnds" ~ "(7) Relative Connectedness - Friending",
    measure == "relative_grps" ~ "(8) Relative Connectedness - Groups"
  )) %>%
  arrange(measure)
names(nuts3_corrs.decomp) <- c("measure", nums)

write_csv(nuts3_corrs.decomp, "output/regional_corrs.csv")

# Produce the final signal correlations table
nuts3_signal_corrs.decomp <- nuts3_signal_corrs.decomp %>%
  select(measure, n_frnd_nat_lcl, produ_any_de, n_lcl_50n_grps, n_frnd_sy_lcl,
         n_frnd_nat_lcl_NATIVE, n_lcl_50n_grps_NATIVE, relative_frnds, relative_grps) %>%
  mutate(measure = case_when(
    measure == "n_frnd_nat_lcl" ~ "(1) SY Migrants - N Local Native Friends",
    measure == "produ_any_de" ~ "(2) SY Migrants - Produced Content in DE",
    measure == "n_lcl_50n_grps" ~ "(3) SY Migrants - N Local Native Groups",
    measure == "n_frnd_sy_lcl" ~ "(4) SY Migrants - N Local SY Friends",
    measure == "n_frnd_nat_lcl_NATIVE" ~ "(5) General Friendliness - Friending",
    measure == "n_lcl_50n_grps_NATIVE" ~ "(6) General Friendliness - Groups",
    measure == "relative_frnds" ~ "(7) Relative Connectedness - Friending",
    measure == "relative_grps" ~ "(8) Relative Connectedness - Groups"
  )) %>%
  arrange(measure)
names(nuts3_signal_corrs.decomp) <- c("measure", nums)

write_csv(nuts3_signal_corrs.decomp, "output/regional_corrs_SIGNAL.csv")

pal5 <-  (brewer.pal(n=5, name="YlGnBu"))

regional_decomp_scatter_dat <- nuts3_fes_wide.decomp %>%
  mutate(`Integration %-tile` = paste0(as.character((ntile(n_frnd_nat_lcl, 5) - 1) * 20), "-",
                                       as.character(ntile(n_frnd_nat_lcl, 5) * 20)))

wtd_cor <- wtd.cor(x=regional_decomp_scatter_dat$relative_frnds,
                   y=regional_decomp_scatter_dat$n_frnd_nat_lcl_NATIVE,
                   weight=regional_decomp_scatter_dat$sy_n)

### This produces Figure 4, panel c ###

ggplot(regional_decomp_scatter_dat, aes(x=relative_frnds, y=n_frnd_nat_lcl_NATIVE)) +
  geom_point(aes(size=sy_n, fill=`Integration %-tile`), pch=21, stroke=0.25, col = "#999999", alpha=0.6) +
  scale_size_continuous(range = c(0.5, 20)) +
  scale_fill_manual(values = pal5) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = sy_n), col="gray") +
  theme_classic() +
  guides(size="none") +
  guides(size="none",
         fill = guide_legend(override.aes = list(alpha = 1, size=7))) +
  geom_text(x=max(regional_decomp_scatter_dat$relative_frnds) -0.03,
            y=max(regional_decomp_scatter_dat$n_frnd_nat_lcl_NATIVE) - 25,
            label=str_interp("Weighted Correlation = ${format(round(wtd_cor[[1]], 3), nsmall=3)} (${format(round(wtd_cor[[2]], 3), nsmall=3)})"),
            col="#595959",
            size=3.2) +
  labs(x = "Relative Friending", y = "General Friendliness")

ggsave("output/regional_decomp_scatter.png", last_plot(), width=7, height=4)



#######################
## 5. Decomposition ##
######################

# Make the data to do the decomposition R2 regressions
reg_dat <- dat %>%
  filter(sample == "all", measure == "n_frnd_nat_lcl") %>%
  mutate(rel_frnd = sy_avg_resid/native_avg_resid) %>%
  select(nuts3, sy_avg_resid, native_avg_resid, rel_frnd)

tibble(
  measure = c("gf", "rf", "both"),
  r2 = c(
    summary(lm(log(sy_avg_resid) ~ log(native_avg_resid), data=reg_dat))$r.squared,
    summary(lm(log(sy_avg_resid) ~ log(rel_frnd), data=reg_dat))$r.squared,
    summary(lm(log(sy_avg_resid) ~ log(rel_frnd) + log(native_avg_resid), data=reg_dat))$r.squared)
) %>% write_csv("output/decomp_rsq_analysis.csv")

# Summarize the data by quintile
quintile_dat <- dat %>%
  filter(sample == "all", measure == "n_frnd_nat_lcl") %>%
  mutate(rel_frnd = sy_avg_resid/native_avg_resid) %>%
  select(nuts3, sy_avg_resid, native_avg_resid, rel_frnd) %>%
  mutate(sy_avg_resid_ntile = ntile(sy_avg_resid, 5)) %>%
  group_by(sy_avg_resid_ntile) %>%
  summarise(
    sy_avg_resid = wtd.mean(sy_avg_resid),
    native_avg_resid = wtd.mean(native_avg_resid),
    rel_frnd = wtd.mean(rel_frnd)) %>%
  ungroup()


# Get each of the quintile averages for all 3 measures
bottom_integration <- filter(quintile_dat, sy_avg_resid_ntile == 1)$sy_avg_resid
top_integration <- filter(quintile_dat, sy_avg_resid_ntile == 5)$sy_avg_resid

bottom_gf <- filter(quintile_dat, sy_avg_resid_ntile == 1)$native_avg_resid
top_gf <- filter(quintile_dat, sy_avg_resid_ntile == 5)$native_avg_resid

bottom_rf <- filter(quintile_dat, sy_avg_resid_ntile == 1)$rel_frnd
top_rf <- filter(quintile_dat, sy_avg_resid_ntile == 5)$rel_frnd

measure_levels <-
  c("Empirically\nObserved\nLow-Int",
    "No Covariance",
    "No Covariance\n+ High GF",
    "No Covariance\n+ High RF",
    "Empirically\nObserved\nHigh-Int")

tibble(
  measures = measure_levels,
  values = c(bottom_integration,
             bottom_gf * bottom_rf,
             top_gf * bottom_rf,
             bottom_gf * top_rf,
             top_integration)) %>%
  mutate(measures = factor(measures, levels = measure_levels)) %>%
  ggplot(dat=., aes(x=measures, y=values)) +
  geom_bar(stat="identity", width=0.6) +
  theme_classic() +
  geom_text(aes(label = round(values, 2)), vjust = -0.2) +
  labs(x="", y="Regional SY Migrant Friending Integration")

ggsave("output/decomp_analysis.png", last_plot(), width=6, height=5)



###################################
## 6. Robustness Check Analyses ##
##################################

# See the README for instructions on how to access this data
sy_pop_fb <- read_csv("input/regional_integration_sy_pop.csv")

rhs_sy_robustness_extensive <- kreis_demos %>%
  filter(year == 2019) %>%
  left_join(kreis_xw, by=c("ags"="ID")) %>%
  mutate(pop_sy = pop_syr_tot) %>%
  select(nuts3=NUTS3, ags, pop_sy) %>%
  left_join(sy_pop_fb, by=c("nuts3"="curr_nuts3")) %>%
  mutate(share_sy_on_fb = n_sy_fb/pop_sy)

# Impute the bad data with overall mean
impute_val <- mean(filter(rhs_sy_robustness_extensive, !ags %in% kreis_to_exclude)$share_sy_on_fb)
rhs_sy_robustness_extensive <- rhs_sy_robustness_extensive %>%
  mutate(share_sy_on_fb = if_else(ags %in% kreis_to_exclude, impute_val, share_sy_on_fb)) %>%
  select(nuts3, share_sy_on_fb)


### Get the other RHS measures ###
# See the README for instructions on how to access these data
rhs_sy_robustness_indiv <- read_csv("input/regional_integration_sy_robustness.csv")
rhs_native_robustness_indiv <- read_csv("input/regional_integration_native_robustness.csv")


### Get the LHS ###
lhs_robustness <- dat %>%
  filter(sample == "all", measure == "n_frnd_nat_lcl") %>%
  select(nuts3, measure, sy_avg_resid, native_avg_resid) %>%
  mutate(relative_friending = sy_avg_resid/native_avg_resid)


### Make the full robustness check data ###
robustness_sy_full <- lhs_robustness %>%
  inner_join(rhs_sy_robustness_indiv, by=c("nuts3"="curr_nuts3")) %>%
  inner_join(rhs_sy_robustness_extensive, by="nuts3")

robustness_native_full <- lhs_robustness %>%
  inner_join(rhs_native_robustness_indiv, by=c("nuts3"="curr_nuts3")) %>%
  inner_join(residualize_extensive, by="nuts3")


rhs_sy_observables <- c("age_brkt_18_24 + age_brkt_25_34 + age_brkt_35_44 + age_brkt_45_54 + age_brkt_55_plus + is_female + quarters_since_de")
rhs_sy_usage <- c("indiv_l1080_cntrl + share_sy_on_fb")
rhs_native_observables <- c("age_brkt_18_24 + age_brkt_25_34 + age_brkt_35_44 + age_brkt_45_54 + age_brkt_55_plus + is_female + has_college")
rhs_native_usage <- c("indiv_l1080_cntrl + share_natives_on_fb")

#### Observables #####
gen.rsq <- function(rhs, data){
  c(
    summary(lm(formula=paste("sy_avg_resid", rhs, sep=" ~ "), data=data))$adj.r.squared,
    summary(lm(formula=paste("relative_friending", rhs, sep=" ~ "), data=data))$adj.r.squared,
    summary(lm(formula=paste("native_avg_resid", rhs, sep=" ~ "), data=data))$adj.r.squared
  )
}

robustness_dat <- tibble(
  measure = c("SY Migrants - N Local Native Friends", "Relative Connectedness - Friending", "General Friendliness - Friending"),
  sy_observables = gen.rsq(rhs_sy_observables, robustness_sy_full),
  sy_usage = gen.rsq(rhs_sy_usage, robustness_sy_full),
  native_observables = gen.rsq(rhs_native_observables, robustness_native_full),
  native_usage = gen.rsq(rhs_native_usage, robustness_native_full)
)

write_csv(robustness_dat, "output/regional_robustness_rsq.csv")



####################################
## 7. SY vs T5 Integration and RF ##
####################################

# Correlate the regional other T5 migrant countries against Syrians
decomp_t5 <- dat %>%
  filter(sample == "all") %>%
  select(nuts3, measure, t5_avg_resid) %>%
  spread(key=measure, value=t5_avg_resid) %>%
  select(nuts3, n_frnd_nat_lcl)

sy_vs_t5_dat <- inner_join(
  select(decomp_sy_migrants, nuts3, sy_n_frnd_nat_lcl = n_frnd_nat_lcl, sy_n),
  select(decomp_t5, nuts3, t5_n_frnd_nat_lcl=n_frnd_nat_lcl),
  by="nuts3") %>%
  inner_join(rename(decomp_natives, native_n_frnd_nat_lcl=n_frnd_nat_lcl), by="nuts3") %>%
  mutate(
    sy_relative_frnds = sy_n_frnd_nat_lcl/native_n_frnd_nat_lcl,
    t5_relative_frnds = t5_n_frnd_nat_lcl/native_n_frnd_nat_lcl)

wtd_cor_t5_n_frnd_nat_lcl <- wtd.cor(
  sy_vs_t5_dat$sy_n_frnd_nat_lcl,
  sy_vs_t5_dat$t5_n_frnd_nat_lcl,
  sy_vs_t5_dat$sy_n)[1]

ggplot(sy_vs_t5_dat, aes(x=sy_n_frnd_nat_lcl, t5_n_frnd_nat_lcl)) +
  geom_point(aes(size=sy_n), show.legend = FALSE) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = sy_n), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor_t5_n_frnd_nat_lcl, 3)}"),
       x = "Native Local Friending: Syrian Migrants",
       y = "Native Local Friending: Other Migrants")

ggsave("output/sy_vs_t5_n_frnd_nat_lcl.png", last_plot(), width=5, height=4)

wtd_cor_t5_relative_frnds <- wtd.cor(
  sy_vs_t5_dat$sy_relative_frnds,
  sy_vs_t5_dat$t5_relative_frnds,
  sy_vs_t5_dat$sy_n)[1]

ggplot(sy_vs_t5_dat, aes(x=sy_relative_frnds, t5_relative_frnds)) +
  geom_point(aes(size=sy_n), show.legend = FALSE) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = sy_n), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor_t5_relative_frnds, 3)}"),
       x = "Relative Friending: Syrian Migrants",
       y = "Relative Friending: Other Migrants")

ggsave("output/sy_vs_t5_relative_frnds.png", last_plot(), width=5, height=4)
